# This script creates multiple datasets for training the newborn MRS (nMRS) only model, with different degrees of oversampling..
# Oversampling performed using ADASYN
# Python version 3.6.8 is used

# Imports
import os
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from collections import Counter
from imblearn.over_sampling import ADASYN

# Set working directory
# NOTE(review): "/../../" normalises to the filesystem root "/" -- this looks
# like a placeholder; confirm it is replaced with the real project directory
# before running the script.
os.chdir("/../../")

#######################
### Import datasets ###
#######################
# Load nMRS data - data found in IOWBC_MRS_data.xlsx, sheet: IOWBC_nMRS
data = pd.read_csv("Newborn_MRS_Score1_data.csv", index_col=False)

# drop IDs without an Asthma_10YR outcome
# NOTE(review): dropna() with no arguments drops any row containing a NaN in
# ANY column, not only Asthma_10YR. If other columns can be missing this drops
# more rows than the comment states -- consider dropna(subset=['Asthma_10YR']).
data1 = data.dropna()
# Tally the outcome distribution (interactive inspection; result not stored).
Counter(data1.Asthma_10YR)
# 747 IDs - 623 controls, 124 cases

# Split newborn MRS dataset into a training and test set
#   The nMRS test set will contain individuals who are present in either the
#   early life or preschool test sets - these IDs were identified from the
#   test sets used to develop the clinical ML models:
#     Early_life_test_dataset_255IDs.csv - data found in IOWBC_training_test_data.xlsx, sheet: Early life test set
#     Preschool_test_dataset_183IDs.csv  - data found in IOWBC_training_test_data.xlsx, sheet: Preschool test set

earlylife_test = pd.read_csv("Early_life_test_dataset_255IDs.csv", index_col=False)
del earlylife_test['Unnamed: 0']  # drop the exported row-index column (255 IDs)

preschool_test = pd.read_csv("Preschool_test_dataset_183IDs.csv", index_col=False)
del preschool_test['Unnamed: 0']  # drop the exported row-index column (183 IDs)

# Format study IDs to enable cross-referencing with methylation data IDs:
# append the '_GU' suffix, then zero-pad on the left to a total width of 7.
for _test_frame in (earlylife_test, preschool_test):
    _test_frame['Study_ID'] = _test_frame['Study_ID'].astype(str) + '_GU'
    _test_frame['Study_ID'] = _test_frame['Study_ID'].apply(lambda s: s.rjust(7, '0'))


# Subset IDs in the initial early life / preschool test sets into the nMRS
# test set; everything else becomes the nMRS training set.
_in_earlylife = data1.Study_ID.isin(earlylife_test.Study_ID)
_in_preschool = data1.Study_ID.isin(preschool_test.Study_ID)

# 239 IDs - 205 controls, 34 cases (14.2%)
MRS_test = data1[_in_earlylife | _in_preschool]

# 508 IDs - 418 controls, 90 cases (17.7%)
MRS_train = data1[~(_in_earlylife | _in_preschool)]

# Save nMRS training and test datasets.
# BUG FIX: the trailing "- data found in ..." annotations were bare prose
# after the .to_csv() calls, which is a SyntaxError; they are comments now.
# NOTE(review): these paths are absolute ("/Newborn_MRS/...") while the
# later saves use a relative "../Newborn_MRS/" -- confirm which is intended.
# Data found in IOWBC_MRS_data.xlsx, sheet: nMRS test
MRS_test.to_csv("/Newborn_MRS/MRSonly_model_239ID_test_dataset.csv")
# Data found in IOWBC_MRS_data.xlsx, sheet: nMRS training
MRS_train.to_csv("/Newborn_MRS/MRSonly_model_508ID_training_dataset.csv")

# Standardise nMRS training and test datasets and save.
# The scaler is fitted on the training set only and then reused to transform
# the test set, so no test-set information leaks into the transform.
# BUG FIX: the trailing "- data found in ..." annotations after the two
# .to_csv() calls were bare prose (SyntaxError); they are comments now.
scaler = StandardScaler()

x = MRS_train.copy()
x = x[['score']]  # standardise the raw nMRS score column only
MRSsc_train = pd.DataFrame(scaler.fit_transform(x.iloc[:, :]))
MRSsc_train.columns = ['MRS_scaled']
MRS_SXY_train = pd.concat([MRS_train.reset_index(drop=True), MRSsc_train], axis=1)
MRS_SXY_train = MRS_SXY_train[['Study_ID', 'MRS_scaled', 'Asthma_10YR']]

# Data found in IOWBC_MRS_data.xlsx, sheet: nMRS standardised training
MRS_SXY_train.to_csv("../Newborn_MRS/MRSonly_model_508ID_standardised_training_dataset.csv")

# Tally the outcome distribution (interactive inspection; result not stored).
Counter(MRS_SXY_train.Asthma_10YR)
# Recorded split: 418 controls, 90 cases (~17.7% asthmatic).
# NOTE(review): the original comment here claimed 538 controls / 103 cases,
# which contradicts the counts stated elsewhere in this script -- it appears
# to have been copied from another script.

y = MRS_test.copy()
y = y[['score']]
MRSsc_test = pd.DataFrame(scaler.transform(y.iloc[:, :]))  # reuse training fit
MRSsc_test.columns = ['MRS_scaled']
MRS_SXY_test = pd.concat([MRS_test.reset_index(drop=True), MRSsc_test], axis=1)
MRS_SXY_test = MRS_SXY_test[['Study_ID', 'MRS_scaled', 'Asthma_10YR']]
Counter(MRS_SXY_test.Asthma_10YR)

# Data found in IOWBC_MRS_data.xlsx, sheet: nMRS standardised test
MRS_SXY_test.to_csv("../Newborn_MRS/MRSonly_model_239ID_standardised_test_dataset.csv")


######################################
### Construct oversampled datasets ###
######################################
# Oversample +/- undersample the dataset.
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling.

# Oversampling is done on the standardised training dataset.
y_train = MRS_SXY_train['Asthma_10YR']  # outcome labels
SX_train = MRS_SXY_train.drop(columns=['Study_ID', 'Asthma_10YR'])  # scaled nMRS feature only
# Counter({0.0: 418, 1.0: 90})
# Keep the training-set study IDs so they can be re-attached after resampling.
Train_IDs = MRS_SXY_train[['Study_ID']]


# Construct the oversampled training datasets with ADASYN.
# REFACTOR: the seven per-percentage sections were copy-paste duplicates;
# they are now driven by one table and one helper.
# CONSISTENCY FIX: the original 25%/50% saves used index=False but the
# 100-300% saves did not (so those files also contained the pandas index
# column); index=False is now used throughout.

# Number of controls in the 508-ID training set (Counter({0.0: 418, 1.0: 90})).
_N_CONTROLS = 418

# Each entry: (percentage label used in the output filename, requested number
# of cases after oversampling). ADASYN's sampling_strategy is the ratio of
# requested cases to the 418 controls.
# NOTE(review): the case targets are copied verbatim from the original
# per-percentage runs; several do not equal 90 * (1 + pct/100), and the counts
# recorded in the original run's comments differ from the targets (ADASYN only
# approximates the request) -- confirm against the study protocol before
# changing them. Recorded case counts: 25%:124, 50%:167, 100%:180, 150%:257,
# 200%:301, 250%:344, 300%:378.
_OVERSAMPLE_TARGETS = [
    ('25', 128),
    ('50', 138),
    ('100', 185),
    ('150', 232),
    ('200', 280),
    ('250', 327),
    ('300', 360),
]


def _oversample_and_save(label, n_cases):
    """Oversample asthma cases with ADASYN and write the synthetic dataset.

    Parameters
    ----------
    label : str
        Oversampling percentage, used in the output filename.
    n_cases : int
        Requested number of cases after oversampling; divided by the 418
        training-set controls to give ADASYN's sampling_strategy.

    Returns
    -------
    pandas.DataFrame
        Columns ['Study_ID', 'MRS', 'Asthma_10YR']. Synthetic rows appended
        beyond the 508 original rows have NaN Study_IDs, as in the original
        script (pd.concat aligns the shorter ID frame with NaN padding).
    """
    osx, oy = ADASYN(sampling_strategy=(n_cases / _N_CONTROLS),
                     random_state=123).fit_resample(SX_train, y_train)
    # NOTE: the original print label said "Original dataset shape" but the
    # counts shown are the RESAMPLED class counts; the label is corrected.
    print('Resampled dataset shape %s' % Counter(oy))

    # Convert resampled arrays into dataframes; continuous variables to 6 dp.
    osx_df = pd.DataFrame(data=osx).round(6)
    oy_df = pd.DataFrame(data=oy)
    oy_df.columns = ['Asthma_10YR']

    # Re-attach study IDs and the outcome column.
    out = pd.concat([Train_IDs.reset_index(drop=True), osx_df], axis=1)
    out.columns = ['Study_ID', 'MRS']
    out = pd.concat([out, oy_df], axis=1)
    out.to_csv('MRS_only_standardised_oversampled_training_dataset_'
               + label + '%.csv', index=False)
    return out


# Build every oversampled dataset; keep each frame available under its
# original module-level name for interactive use / downstream code.
_oversampled_frames = {label: _oversample_and_save(label, n)
                       for label, n in _OVERSAMPLE_TARGETS}
Oversampled_25 = _oversampled_frames['25']
Oversampled_50 = _oversampled_frames['50']
Oversampled_100 = _oversampled_frames['100']
Oversampled_150 = _oversampled_frames['150']
Oversampled_200 = _oversampled_frames['200']
Oversampled_250 = _oversampled_frames['250']
Oversampled_300 = _oversampled_frames['300']